
src/remux/mp4-remuxer.ts

import AAC from './aac-helper';
import MP4 from './mp4-generator';
import type { HlsEventEmitter } from '../events';
import { Events } from '../events';
import { ErrorTypes, ErrorDetails } from '../errors';
import { logger } from '../utils/logger';
import {
  InitSegmentData,
  Remuxer,
  RemuxerResult,
  RemuxedMetadata,
  RemuxedTrack,
  RemuxedUserdata,
} from '../types/remuxer';
import type {
  AudioSample,
  AvcSample,
  DemuxedAudioTrack,
  DemuxedAvcTrack,
  DemuxedMetadataTrack,
  DemuxedUserdataTrack,
} from '../types/demuxer';
import type { TrackSet } from '../types/track';
import type { SourceBufferName } from '../types/buffer';
import type { Fragment } from '../loader/fragment';
import type { HlsConfig } from '../config';
import { toMsFromMpegTsClock } from '../utils/timescale-conversion';

const MAX_SILENT_FRAME_DURATION = 10 * 1000; // 10 seconds
const AAC_SAMPLES_PER_FRAME = 1024;
const MPEG_AUDIO_SAMPLE_PER_FRAME = 1152;
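// For reference: at a 44.1 kHz sample rate, one AAC frame (1024 samples) lasts
// 1024 / 44100 ≈ 23.2 ms and one MPEG audio frame (1152 samples) lasts
// 1152 / 44100 ≈ 26.1 ms, so MAX_SILENT_FRAME_DURATION (10 s) spans several
// hundred frames.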

let chromeVersion: number | null = null;
let safariWebkitVersion: number | null = null;
let requiresPositiveDts: boolean = false;

export default class MP4Remuxer implements Remuxer {
  private observer: HlsEventEmitter;
  private config: HlsConfig;
  private typeSupported: any;
  private ISGenerated: boolean = false;
  private _initPTS!: number;
  private _initDTS!: number;
  private nextAvcDts: number | null = null;
  private nextAudioPts: number | null = null;
  private isAudioContiguous: boolean = false;
  private isVideoContiguous: boolean = false;

  constructor(
    observer: HlsEventEmitter,
    config: HlsConfig,
    typeSupported,
    vendor = ''
  ) {
    this.observer = observer;
    this.config = config;
    this.typeSupported = typeSupported;
    this.ISGenerated = false;

    if (chromeVersion === null) {
      const userAgent = navigator.userAgent || '';
      const result = userAgent.match(/Chrome\/(\d+)/i);
      chromeVersion = result ? parseInt(result[1]) : 0;
    }
    if (safariWebkitVersion === null) {
      const result = navigator.userAgent.match(/Safari\/(\d+)/i);
      safariWebkitVersion = result ? parseInt(result[1]) : 0;
    }
    requiresPositiveDts =
      (!!chromeVersion && chromeVersion < 75) ||
      (!!safariWebkitVersion && safariWebkitVersion < 600);
  }
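  // A minimal construction sketch. The exact shape of `typeSupported` is an
  // assumption here (booleans derived from MediaSource.isTypeSupported probes,
  // as the transmuxer does elsewhere in hls.js):
  //
  //   const remuxer = new MP4Remuxer(observer, config, {
  //     mp4: MediaSource.isTypeSupported('video/mp4'),
  //     mpeg: MediaSource.isTypeSupported('audio/mpeg'),
  //     mp3: MediaSource.isTypeSupported('audio/mp4; codecs="mp3"'),
  //   });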

  destroy() {}

  resetTimeStamp(defaultTimeStamp) {
    logger.log('[mp4-remuxer]: initPTS & initDTS reset');
    this._initPTS = this._initDTS = defaultTimeStamp;
  }

  resetNextTimestamp() {
    logger.log('[mp4-remuxer]: reset next timestamp');
    this.isVideoContiguous = false;
    this.isAudioContiguous = false;
  }

  resetInitSegment() {
    logger.log('[mp4-remuxer]: ISGenerated flag reset');
    this.ISGenerated = false;
  }

  getVideoStartPts(videoSamples) {
    let rolloverDetected = false;
    const startPTS = videoSamples.reduce((minPTS, sample) => {
      const delta = sample.pts - minPTS;
      if (delta < -4294967296) {
        // 2^32; see normalizePts for the reasoning. We're hitting a rollover
        // here, and we don't want it to skew the timeOffset calculation.
        rolloverDetected = true;
        return normalizePts(minPTS, sample.pts);
      } else if (delta > 0) {
        return minPTS;
      } else {
        return sample.pts;
      }
    }, videoSamples[0].pts);
    if (rolloverDetected) {
      logger.debug('PTS rollover detected');
    }
    return startPTS;
  }
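  // Example: for sample PTS values [8589933000, 1000, 2000] (a 33-bit wrap
  // between the first two samples), the reduce sees a delta below -2^32,
  // flags the rollover, and returns the first PTS re-expressed on the
  // post-wrap timeline (8589933000 - 2^33 = -1592) rather than a bogus 1000.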

  remux(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    id3Track: DemuxedMetadataTrack,
    textTrack: DemuxedUserdataTrack,
    timeOffset: number,
    accurateTimeOffset: boolean,
    flush: boolean
  ): RemuxerResult {
    let video;
    let audio;
    let initSegment;
    let text;
    let id3;
    let independent: boolean | undefined;
    let audioTimeOffset = timeOffset;
    let videoTimeOffset = timeOffset;

    // If we're remuxing audio and video progressively, wait until we've received enough samples for each track before proceeding.
    // This is done to synchronize the audio and video streams. We know if the current segment will have samples if the "pid"
    // parameter is greater than -1. The pid is set when the PMT is parsed, which contains the tracks list.
    // However, if the initSegment has already been generated, or we've reached the end of a segment (flush),
    // then we can remux one track without waiting for the other.
    const hasAudio = audioTrack.pid > -1;
    const hasVideo = videoTrack.pid > -1;
    const enoughAudioSamples = audioTrack.samples.length > 0;
    const enoughVideoSamples = videoTrack.samples.length > 1;
    const canRemuxAvc =
      ((!hasAudio || enoughAudioSamples) &&
        (!hasVideo || enoughVideoSamples)) ||
      this.ISGenerated ||
      flush;

    if (canRemuxAvc) {
      if (!this.ISGenerated) {
        initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
      }

      const isVideoContiguous = this.isVideoContiguous;
      if (
        enoughVideoSamples &&
        !isVideoContiguous &&
        this.config.forceKeyFrameOnDiscontinuity
      ) {
        const length = videoTrack.samples.length;
        const firstKeyFrameIndex = findKeyframeIndex(videoTrack.samples);
        independent = true;
        if (firstKeyFrameIndex > 0) {
          logger.warn(
            `[mp4-remuxer]: Dropped ${firstKeyFrameIndex} out of ${length} video samples due to a missing keyframe`
          );
          const startPTS = this.getVideoStartPts(videoTrack.samples);
          videoTrack.samples = videoTrack.samples.slice(firstKeyFrameIndex);
          videoTrack.dropped += firstKeyFrameIndex;
          videoTimeOffset +=
            (videoTrack.samples[0].pts - startPTS) /
            (videoTrack.timescale || 90000);
        } else if (firstKeyFrameIndex === -1) {
          logger.warn(
            `[mp4-remuxer]: No keyframe found out of ${length} video samples`
          );
          independent = false;
        }
      }

      if (this.ISGenerated) {
        if (enoughAudioSamples && enoughVideoSamples) {
          // timeOffset is expected to be the offset of the first timestamp of this fragment (first DTS).
          // if the first audio DTS is not aligned with the first video DTS, we need to take that into account
          // when providing timeOffset to remuxAudio / remuxVideo. if we don't, there may be a permanent, small
          // drift between the audio and video streams
          const startPTS = this.getVideoStartPts(videoTrack.samples);
          const tsDelta =
            normalizePts(audioTrack.samples[0].pts, startPTS) - startPTS;
          const audiovideoTimestampDelta = tsDelta / videoTrack.inputTimeScale;
          audioTimeOffset += Math.max(0, audiovideoTimestampDelta);
          videoTimeOffset += Math.max(0, -audiovideoTimestampDelta);
        }

        // Purposefully remuxing audio before video, so that remuxVideo can use nextAudioPts, which is calculated in remuxAudio.
        if (enoughAudioSamples) {
          // if the initSegment was generated without audio samples, regenerate it
          if (!audioTrack.samplerate) {
            logger.warn(
              '[mp4-remuxer]: regenerate InitSegment as audio detected'
            );
            initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
            delete initSegment.video;
          }
          audio = this.remuxAudio(
            audioTrack,
            audioTimeOffset,
            this.isAudioContiguous,
            accurateTimeOffset,
            enoughVideoSamples ? videoTimeOffset : undefined
          );
          if (enoughVideoSamples) {
            const audioTrackLength = audio ? audio.endPTS - audio.startPTS : 0;
            // if the initSegment was generated without video samples, regenerate it
            if (!videoTrack.inputTimeScale) {
              logger.warn(
                '[mp4-remuxer]: regenerate InitSegment as video detected'
              );
              initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
            }
            video = this.remuxVideo(
              videoTrack,
              videoTimeOffset,
              isVideoContiguous,
              audioTrackLength
            );
          }
        } else if (enoughVideoSamples) {
          video = this.remuxVideo(
            videoTrack,
            videoTimeOffset,
            isVideoContiguous,
            0
          );
        }
        if (video && independent !== undefined) {
          video.independent = independent;
        }
      }
    }

    // Allow ID3 and text to remux, even if more audio/video samples are required
    if (this.ISGenerated) {
      if (id3Track.samples.length) {
        id3 = this.remuxID3(id3Track, timeOffset);
      }

      if (textTrack.samples.length) {
        text = this.remuxText(textTrack, timeOffset);
      }
    }

    return {
      audio,
      video,
      initSegment,
      independent,
      text,
      id3,
    };
  }
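  // A/V alignment example for the offsets above: if video starts at PTS
  // 900000 and audio at PTS 909000 with a 90 kHz input timescale, tsDelta is
  // 9000 and audiovideoTimestampDelta is 0.1 s, so audioTimeOffset is pushed
  // forward by 0.1 s while videoTimeOffset is left unchanged.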

  generateIS(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    timeOffset
  ): InitSegmentData | undefined {
    const audioSamples = audioTrack.samples;
    const videoSamples = videoTrack.samples;
    const typeSupported = this.typeSupported;
    const tracks: TrackSet = {};
    const computePTSDTS = !Number.isFinite(this._initPTS);
    let container = 'audio/mp4';
    let initPTS: number | undefined;
    let initDTS: number | undefined;
    let timescale: number | undefined;

    if (computePTSDTS) {
      initPTS = initDTS = Infinity;
    }

    if (audioTrack.config && audioSamples.length) {
      // let's use the audio sampling rate as the MP4 timescale.
      // rationale: there is an integer number of audio samples per frame (1024 for AAC),
      // so using the sampling rate gives an integer MP4 frame duration.
      // this avoids potential rounding issues and A/V sync issues
      audioTrack.timescale = audioTrack.samplerate;
      if (!audioTrack.isAAC) {
        if (typeSupported.mpeg) {
          // Chrome and Safari
          container = 'audio/mpeg';
          audioTrack.codec = '';
        } else if (typeSupported.mp3) {
          // Firefox
          audioTrack.codec = 'mp3';
        }
      }
      tracks.audio = {
        id: 'audio',
        container: container,
        codec: audioTrack.codec,
        initSegment:
          !audioTrack.isAAC && typeSupported.mpeg
            ? new Uint8Array(0)
            : MP4.initSegment([audioTrack]),
        metadata: {
          channelCount: audioTrack.channelCount,
        },
      };
      if (computePTSDTS) {
        timescale = audioTrack.inputTimeScale;
        // remember the first PTS of this demuxing context. for audio, PTS = DTS
        initPTS = initDTS =
          audioSamples[0].pts - Math.round(timescale * timeOffset);
      }
    }

    if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
      // let's use the input timescale as the MP4 video timescale.
      // using the input timescale directly avoids rounding issues in frame duration / cts computation
      videoTrack.timescale = videoTrack.inputTimeScale;
      tracks.video = {
        id: 'main',
        container: 'video/mp4',
        codec: videoTrack.codec,
        initSegment: MP4.initSegment([videoTrack]),
        metadata: {
          width: videoTrack.width,
          height: videoTrack.height,
        },
      };
      if (computePTSDTS) {
        timescale = videoTrack.inputTimeScale;
        const startPTS = this.getVideoStartPts(videoSamples);
        const startOffset = Math.round(timescale * timeOffset);
        initDTS = Math.min(
          initDTS as number,
          normalizePts(videoSamples[0].dts, startPTS) - startOffset
        );
        initPTS = Math.min(initPTS as number, startPTS - startOffset);
      }
    }

    if (Object.keys(tracks).length) {
      this.ISGenerated = true;
      if (computePTSDTS) {
        this._initPTS = initPTS as number;
        this._initDTS = initDTS as number;
      }

      return {
        tracks,
        initPTS,
        timescale,
      };
    }
  }
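  // initPTS example: if the first audio sample has pts 900000, the input
  // timescale is 90000 and timeOffset is 10 s, then
  // initPTS = 900000 - Math.round(90000 * 10) = 0, anchoring the media
  // timeline so that this sample plays at exactly 10 s.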

  remuxVideo(
    track: DemuxedAvcTrack,
    timeOffset: number,
    contiguous: boolean,
    audioTrackLength: number
  ): RemuxedTrack | undefined {
    const timeScale: number = track.inputTimeScale;
    const inputSamples: Array<AvcSample> = track.samples;
    const outputSamples: Array<Mp4Sample> = [];
    const nbSamples: number = inputSamples.length;
    const initPTS: number = this._initPTS;
    let nextAvcDts = this.nextAvcDts;
    let offset = 8;
    let mp4SampleDuration!: number;
    let firstDTS;
    let lastDTS;
    let minPTS: number = Number.POSITIVE_INFINITY;
    let maxPTS: number = Number.NEGATIVE_INFINITY;
    let ptsDtsShift = 0;
    let sortSamples = false;

    // if the parsed fragment is contiguous with the last one, use the last DTS value as reference
    if (!contiguous || nextAvcDts === null) {
      const pts = timeOffset * timeScale;
      const cts =
        inputSamples[0].pts -
        normalizePts(inputSamples[0].dts, inputSamples[0].pts);
      // if not contiguous, use the target timeOffset
      nextAvcDts = pts - cts;
    }

    // PTS is coded on 33 bits, and can loop from -2^32 to 2^32
    // normalizePts will make PTS/DTS values monotonic; we use the last known DTS value as the reference
    for (let i = 0; i < nbSamples; i++) {
      const sample = inputSamples[i];
      sample.pts = normalizePts(sample.pts - initPTS, nextAvcDts);
      sample.dts = normalizePts(sample.dts - initPTS, nextAvcDts);
      if (sample.dts > sample.pts) {
        const PTS_DTS_SHIFT_TOLERANCE_90KHZ = 90000 * 0.2;
        ptsDtsShift = Math.max(
          Math.min(ptsDtsShift, sample.pts - sample.dts),
          -1 * PTS_DTS_SHIFT_TOLERANCE_90KHZ
        );
      }
      if (sample.dts < inputSamples[i > 0 ? i - 1 : i].dts) {
        sortSamples = true;
      }
    }

    // sort video samples by DTS then PTS then demux id order
    if (sortSamples) {
      inputSamples.sort(function (a, b) {
        const deltadts = a.dts - b.dts;
        const deltapts = a.pts - b.pts;
        return deltadts || deltapts;
      });
    }

    // Get first/last DTS
    firstDTS = inputSamples[0].dts;
    lastDTS = inputSamples[inputSamples.length - 1].dts;

    // on Safari let's signal the same sample duration for all samples
    // sample duration (as expected by trun MP4 boxes) should be the delta between sample DTS values
    // set this constant duration to the average delta between consecutive DTS values.
    const averageSampleDuration = Math.round(
      (lastDTS - firstDTS) / (nbSamples - 1)
    );

    // handle broken streams with PTS < DTS, with a tolerance of up to 0.2 seconds
    if (ptsDtsShift < 0) {
      if (ptsDtsShift < averageSampleDuration * -2) {
        // Fix for "CNN special report, with CC" in test-streams (including Safari browser)
        // With large PTS < DTS errors such as this, we want to correct CTS while maintaining increasing DTS values
        logger.warn(
          `PTS < DTS detected in video samples, offsetting DTS from PTS by ${toMsFromMpegTsClock(
            -averageSampleDuration,
            true
          )} ms`
        );
        let lastDts = ptsDtsShift;
        for (let i = 0; i < nbSamples; i++) {
          inputSamples[i].dts = lastDts = Math.max(
            lastDts,
            inputSamples[i].pts - averageSampleDuration
          );
          inputSamples[i].pts = Math.max(lastDts, inputSamples[i].pts);
        }
      } else {
        // Fix for "Custom IV with bad PTS DTS" in test-streams
        // With smaller PTS < DTS errors we can simply move all DTS back. This increases CTS without causing buffer gaps or decode errors in Safari
        logger.warn(
          `PTS < DTS detected in video samples, shifting DTS by ${toMsFromMpegTsClock(
            ptsDtsShift,
            true
          )} ms to overcome this issue`
        );
        for (let i = 0; i < nbSamples; i++) {
          inputSamples[i].dts = inputSamples[i].dts + ptsDtsShift;
        }
      }
      firstDTS = inputSamples[0].dts;
    }

    // if fragments are contiguous, detect holes/overlaps between them
    if (contiguous) {
      // check timestamp continuity across consecutive fragments (this is to remove inter-fragment gaps/holes)
      const delta = firstDTS - nextAvcDts;
      const foundHole = delta > averageSampleDuration;
      const foundOverlap = delta < -1;
      if (foundHole || foundOverlap) {
        if (foundHole) {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              delta,
              true
            )} ms (${delta}dts) hole between fragments detected, filling it`
          );
        } else {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              -delta,
              true
            )} ms (${delta}dts) overlapping between fragments detected`
          );
        }
        firstDTS = nextAvcDts;
        const firstPTS = inputSamples[0].pts - delta;
        inputSamples[0].dts = firstDTS;
        inputSamples[0].pts = firstPTS;
        logger.log(
          `Video: First PTS/DTS adjusted: ${toMsFromMpegTsClock(
            firstPTS,
            true
          )}/${toMsFromMpegTsClock(
            firstDTS,
            true
          )}, delta: ${toMsFromMpegTsClock(delta, true)} ms`
        );
      }
    }

    if (requiresPositiveDts) {
      firstDTS = Math.max(0, firstDTS);
    }
    let nbNalu = 0;
    let naluLen = 0;
    for (let i = 0; i < nbSamples; i++) {
      // compute the total AVC sample length and the number of NAL units
      const sample = inputSamples[i];
      const units = sample.units;
      const nbUnits = units.length;
      let sampleLen = 0;
      for (let j = 0; j < nbUnits; j++) {
        sampleLen += units[j].data.length;
      }

      naluLen += sampleLen;
      nbNalu += nbUnits;
      sample.length = sampleLen;

      // normalize PTS/DTS
      // ensure sample monotonic DTS
      sample.dts = Math.max(sample.dts, firstDTS);
      // ensure that the computed value is greater than or equal to the sample DTS
      sample.pts = Math.max(sample.pts, sample.dts, 0);
      minPTS = Math.min(sample.pts, minPTS);
      maxPTS = Math.max(sample.pts, maxPTS);
    }
    lastDTS = inputSamples[nbSamples - 1].dts;

    /* concatenate the video data and construct the mdat in place
      (need 8 more bytes to fill length and mdat type) */
    const mdatSize = naluLen + 4 * nbNalu + 8;
    let mdat;
    try {
      mdat = new Uint8Array(mdatSize);
    } catch (err) {
      this.observer.emit(Events.ERROR, Events.ERROR, {
        type: ErrorTypes.MUX_ERROR,
        details: ErrorDetails.REMUX_ALLOC_ERROR,
        fatal: false,
        bytes: mdatSize,
        reason: `fail allocating video mdat ${mdatSize}`,
      });
      return;
    }
    const view = new DataView(mdat.buffer);
    view.setUint32(0, mdatSize);
    mdat.set(MP4.types.mdat, 4);
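    // mdat layout: [4-byte box size][4-byte 'mdat' type][payload], where the
    // payload stores each NAL unit as a 4-byte length prefix followed by its
    // data; hence mdatSize = naluLen + 4 * nbNalu + 8 above.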

    for (let i = 0; i < nbSamples; i++) {
      const avcSample = inputSamples[i];
      const avcSampleUnits = avcSample.units;
      let mp4SampleLength = 0;
      // convert NALU bitstream to MP4 format (prepend NALU with size field)
      for (let j = 0, nbUnits = avcSampleUnits.length; j < nbUnits; j++) {
        const unit = avcSampleUnits[j];
        const unitData = unit.data;
        const unitDataLen = unit.data.byteLength;
        view.setUint32(offset, unitDataLen);
        offset += 4;
        mdat.set(unitData, offset);
        offset += unitDataLen;
        mp4SampleLength += 4 + unitDataLen;
      }

      // expected sample duration is the Decoding Timestamp diff of consecutive samples
      if (i < nbSamples - 1) {
        mp4SampleDuration = inputSamples[i + 1].dts - avcSample.dts;
      } else {
        const config = this.config;
        const lastFrameDuration =
          avcSample.dts - inputSamples[i > 0 ? i - 1 : i].dts;
        if (config.stretchShortVideoTrack && this.nextAudioPts !== null) {
          // In some cases, a segment's audio track duration may exceed the video track duration.
          // Since we've already remuxed audio, and we know how long the audio track is, we look to
          // see if the delta to the next segment is longer than maxBufferHole.
          // If so, playback would potentially get stuck, so we artificially inflate
          // the duration of the last frame to minimize any potential gap between segments.
          const gapTolerance = Math.floor(config.maxBufferHole * timeScale);
          const deltaToFrameEnd =
            (audioTrackLength
              ? minPTS + audioTrackLength * timeScale
              : this.nextAudioPts) - avcSample.pts;
          if (deltaToFrameEnd > gapTolerance) {
            // We subtract lastFrameDuration from deltaToFrameEnd to try to prevent any video
            // frame overlap. maxBufferHole should be >> lastFrameDuration anyway.
            mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
            if (mp4SampleDuration < 0) {
              mp4SampleDuration = lastFrameDuration;
            }
            logger.log(
              `[mp4-remuxer]: It is approximately ${
                deltaToFrameEnd / 90
              } ms to the next segment; using duration ${
                mp4SampleDuration / 90
              } ms for the last video frame.`
            );
          } else {
            mp4SampleDuration = lastFrameDuration;
          }
        } else {
          mp4SampleDuration = lastFrameDuration;
        }
      }
      const compositionTimeOffset = Math.round(avcSample.pts - avcSample.dts);

      outputSamples.push(
        new Mp4Sample(
          avcSample.key,
          mp4SampleDuration,
          mp4SampleLength,
          compositionTimeOffset
        )
      );
    }

    if (outputSamples.length && chromeVersion && chromeVersion < 70) {
      // Chrome workaround: mark the first sample as a Random Access Point (keyframe) to avoid a sourcebuffer append issue
      // https://code.google.com/p/chromium/issues/detail?id=229412
      const flags = outputSamples[0].flags;
      flags.dependsOn = 2;
      flags.isNonSync = 0;
    }

    console.assert(
      mp4SampleDuration !== undefined,
      'mp4SampleDuration must be computed'
    );
    // the next AVC sample DTS should be equal to the last sample DTS + last sample duration (in PES timescale)
    this.nextAvcDts = nextAvcDts = lastDTS + mp4SampleDuration;
    this.isVideoContiguous = true;
    const moof = MP4.moof(
      track.sequenceNumber++,
      firstDTS,
      Object.assign({}, track, {
        samples: outputSamples,
      })
    );
    const type: SourceBufferName = 'video';
    const data = {
      data1: moof,
      data2: mdat,
      startPTS: minPTS / timeScale,
      endPTS: (maxPTS + mp4SampleDuration) / timeScale,
      startDTS: firstDTS / timeScale,
      endDTS: (nextAvcDts as number) / timeScale,
      type,
      hasAudio: false,
      hasVideo: true,
      nb: outputSamples.length,
      dropped: track.dropped,
    };

    track.samples = [];
    track.dropped = 0;

    console.assert(mdat.length, 'MDAT length must not be zero');

    return data;
  }
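  // The returned data1/data2 pair forms one complete fMP4 fragment; the
  // consumer is expected to append moof (data1) followed immediately by
  // mdat (data2) to the SourceBuffer.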

  remuxAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    accurateTimeOffset: boolean,
    videoTimeOffset?: number
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const mp4SampleDuration: number = track.isAAC
      ? AAC_SAMPLES_PER_FRAME
      : MPEG_AUDIO_SAMPLE_PER_FRAME;
    const inputSampleDuration: number = mp4SampleDuration * scaleFactor;
    const initPTS: number = this._initPTS;
    const rawMPEG: boolean = !track.isAAC && this.typeSupported.mpeg;
    const outputSamples: Array<Mp4Sample> = [];

    let inputSamples: Array<AudioSample> = track.samples;
    let offset: number = rawMPEG ? 0 : 8;
    let fillFrame: any;
    let nextAudioPts: number = this.nextAudioPts || -1;

    // window.audioSamples ? window.audioSamples.push(inputSamples.map(s => s.pts)) : (window.audioSamples = [inputSamples.map(s => s.pts)]);

    // for audio samples, also consider consecutive fragments as contiguous (even if a level switch occurs).
    // for the sake of clarity:
    // consecutive fragments are frags with
    //  - less than a 100ms gap between the new time offset (if accurate) and the next expected PTS, OR
    //  - less than 20 audio frames distance
    // contiguous fragments are consecutive fragments from the same quality level (same level, new SN = old SN + 1)
    // this helps ensure audio continuity
    // and also avoids audio glitches/cuts when switching quality, or reporting a wrong duration on the first audio frame
    const timeOffsetMpegTS = timeOffset * inputTimeScale;
    this.isAudioContiguous = contiguous =
      contiguous ||
      ((inputSamples.length &&
        nextAudioPts > 0 &&
        ((accurateTimeOffset &&
          Math.abs(timeOffsetMpegTS - nextAudioPts) < 9000) ||
          Math.abs(
            normalizePts(inputSamples[0].pts - initPTS, timeOffsetMpegTS) -
              nextAudioPts
          ) <
            20 * inputSampleDuration)) as boolean);

    // compute normalized PTS
    inputSamples.forEach(function (sample) {
      sample.pts = sample.dts = normalizePts(
        sample.pts - initPTS,
        timeOffsetMpegTS
      );
    });

    if (!contiguous || nextAudioPts < 0) {
      // filter out samples with negative PTS, which are not playable anyway.
      // if we don't remove them, they will shift all audio samples forward,
      // leading to audio overlap between the current and next fragment
      inputSamples = inputSamples.filter((sample) => sample.pts >= 0);

      // in case all samples have negative PTS and have been filtered out, return now
      if (!inputSamples.length) {
        return;
      }

      if (videoTimeOffset === 0) {
        // Set the start to 0 to match video so that start gaps larger than inputSampleDuration are filled with silence
        nextAudioPts = 0;
      } else if (accurateTimeOffset) {
        // When not seeking, not live, and LevelDetails.PTSKnown, use the fragment start as the predicted next audio PTS
        nextAudioPts = Math.max(0, timeOffsetMpegTS);
      } else {
        // if frags are not contiguous and we can't trust the time offset, use the first sample PTS as the next audio PTS
        nextAudioPts = inputSamples[0].pts;
      }
    }

    // If the audio track is missing samples, the frames seem to get "left-shifted" within the
    // resulting mp4 segment, causing sync issues and leaving gaps at the end of the audio segment.
    // In an effort to prevent this from happening, we inject frames here where there are gaps.
    // When possible, we inject a silent frame; when that's not possible, we duplicate the last
    // frame.

    if (track.isAAC) {
      const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
      for (let i = 0, nextPts = nextAudioPts; i < inputSamples.length; ) {
        // First, let's see how far off this frame is from where we expect it to be
        const sample = inputSamples[i];
        const pts = sample.pts;
        const delta = pts - nextPts;
        const duration = Math.abs((1000 * delta) / inputTimeScale);

        // When remuxing with video, if we're overlapping by more than a duration, drop this sample to stay in sync
        if (
          delta <= -maxAudioFramesDrift * inputSampleDuration &&
          videoTimeOffset !== undefined
        ) {
          if (contiguous || i > 0) {
            logger.warn(
              `[mp4-remuxer]: Dropping 1 audio frame @ ${(
                nextPts / inputTimeScale
              ).toFixed(3)}s due to ${Math.round(duration)} ms overlap.`
            );
            inputSamples.splice(i, 1);
            // Don't touch nextPts or i
          } else {
            // When changing qualities we can't trust that audio has been appended up to nextAudioPts
            // Warn about the overlap but do not drop samples as that can introduce buffer gaps
            logger.warn(
              `Audio frame @ ${(pts / inputTimeScale).toFixed(
                3
              )}s overlaps nextAudioPts by ${Math.round(
                (1000 * delta) / inputTimeScale
              )} ms.`
            );
            nextPts = pts + inputSampleDuration;
            i++;
          }
        } // eslint-disable-line brace-style

        // Insert missing frames if:
        // 1: We're more than maxAudioFramesDrift frames away
        // 2: Not more than MAX_SILENT_FRAME_DURATION away
        // 3: currentTime (aka nextPts) is not 0
        // 4: remuxing with video (videoTimeOffset !== undefined)
        else if (
          delta >= maxAudioFramesDrift * inputSampleDuration &&
          duration < MAX_SILENT_FRAME_DURATION &&
          videoTimeOffset !== undefined
        ) {
          const missing = Math.floor(delta / inputSampleDuration);
          // Adjust nextPts so that silent samples are aligned with media pts. This will prevent media samples from
          // later being shifted if nextPts is based on timeOffset and delta is not a multiple of inputSampleDuration.
          nextPts = pts - missing * inputSampleDuration;
          logger.warn(
            `[mp4-remuxer]: Injecting ${missing} audio frame @ ${(
              nextPts / inputTimeScale
            ).toFixed(3)}s due to ${Math.round(
              (1000 * delta) / inputTimeScale
            )} ms gap.`
          );
          for (let j = 0; j < missing; j++) {
            const newStamp = Math.max(nextPts as number, 0);
            fillFrame = AAC.getSilentFrame(
              track.manifestCodec || track.codec,
              track.channelCount
            );
            if (!fillFrame) {
              logger.log(
                '[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating last frame instead.'
              );
              fillFrame = sample.unit.subarray();
            }
            inputSamples.splice(i, 0, {
              unit: fillFrame,
              pts: newStamp,
              dts: newStamp,
            });
            nextPts += inputSampleDuration;
            i++;
          }

          // Adjust the sample to the next expected pts
          sample.pts = sample.dts = nextPts;
          nextPts += inputSampleDuration;
          i++;
        } else {
          // Otherwise, just adjust pts
          sample.pts = sample.dts = nextPts;
          nextPts += inputSampleDuration;
          i++;
        }
      }
    }
    let firstPTS: number | null = null;
    let lastPTS: number | null = null;
    let mdat: any;
    let mdatSize: number = 0;
    let sampleLength: number = inputSamples.length;
    while (sampleLength--) {
      mdatSize += inputSamples[sampleLength].unit.byteLength;
    }
    for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
      const audioSample = inputSamples[j];
      const unit = audioSample.unit;
      let pts = audioSample.pts;
      if (lastPTS !== null) {
        // If we have more than one sample, set the duration of the sample to the "real" duration; the PTS diff with
        // the previous sample
        const prevSample = outputSamples[j - 1];
        prevSample.duration = Math.round((pts - lastPTS) / scaleFactor);
      } else {
        const delta = Math.round(
          (1000 * (pts - nextAudioPts)) / inputTimeScale
        );
        let numMissingFrames = 0;
        // if fragments are contiguous, detect holes/overlaps between them
        // contiguous fragments are consecutive fragments from the same quality level (same level, new SN = old SN + 1)
        if (contiguous && track.isAAC) {
          if (delta > 0 && delta < MAX_SILENT_FRAME_DURATION) {
            numMissingFrames = Math.round(
              (pts - nextAudioPts) / inputSampleDuration
            );
            logger.log(
              `[mp4-remuxer]: ${delta} ms hole between AAC samples detected, filling it`
            );
            if (numMissingFrames > 0) {
              fillFrame = AAC.getSilentFrame(
                track.manifestCodec || track.codec,
                track.channelCount
              );
              if (!fillFrame) {
                fillFrame = unit.subarray();
              }

              mdatSize += numMissingFrames * fillFrame.length;
            }
            // if we have frame overlap, overlapping for more than half a frame duration
          } else if (delta < -12) {
            // drop overlapping audio frames... the browser will deal with it
            logger.log(
              `[mp4-remuxer]: drop overlapping AAC sample, expected/parsed/delta:${(
                nextAudioPts / inputTimeScale
              ).toFixed(3)}s/${(pts / inputTimeScale).toFixed(3)}s/${-delta}ms`
            );
            mdatSize -= unit.byteLength;
            continue;
          }
          // set PTS/DTS to the expected PTS/DTS
          pts = nextAudioPts;
        }
        // remember the first PTS of our audioSamples
        firstPTS = pts;
        if (mdatSize > 0) {
          /* concatenate the audio data and construct the mdat in place
            (need 8 more bytes to fill length and mdat type) */
          mdatSize += offset;
          try {
            mdat = new Uint8Array(mdatSize);
          } catch (err) {
            this.observer.emit(Events.ERROR, Events.ERROR, {
              type: ErrorTypes.MUX_ERROR,
              details: ErrorDetails.REMUX_ALLOC_ERROR,
              fatal: false,
              bytes: mdatSize,
              reason: `fail allocating audio mdat ${mdatSize}`,
            });
            return;
          }
          if (!rawMPEG) {
            const view = new DataView(mdat.buffer);
            view.setUint32(0, mdatSize);
            mdat.set(MP4.types.mdat, 4);
          }
        } else {
          // no audio samples
          return;
        }
        for (let i = 0; i < numMissingFrames; i++) {
          fillFrame = AAC.getSilentFrame(
            track.manifestCodec || track.codec,
            track.channelCount
          );
          if (!fillFrame) {
            logger.log(
              '[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating the current frame instead'
            );
            fillFrame = unit.subarray();
          }
          mdat.set(fillFrame, offset);
          offset += fillFrame.byteLength;
          outputSamples.push(
            new Mp4Sample(true, AAC_SAMPLES_PER_FRAME, fillFrame.byteLength, 0)
          );
        }
      }
      mdat.set(unit, offset);
      const unitLen = unit.byteLength;
      offset += unitLen;
      // Default the sample's duration to the computed mp4SampleDuration, which will either be 1024 for AAC or 1152 for MPEG audio.
      // In the case that we have one sample, this will be its duration. If we have more than one sample, the duration
      // becomes the PTS diff with the previous sample
      outputSamples.push(new Mp4Sample(true, mp4SampleDuration, unitLen, 0));
      lastPTS = pts;
    }

    // We could end up with no audio samples if all input samples were overlapping with the previously remuxed ones
    const nbSamples = outputSamples.length;
    if (!nbSamples) {
      return;
    }

    // The next audio sample PTS should be equal to the last sample PTS + duration
    const lastSample = outputSamples[outputSamples.length - 1];
    this.nextAudioPts = nextAudioPts =
      lastPTS! + scaleFactor * lastSample.duration;

    // Set the track samples from inputSamples to outputSamples before remuxing
    const moof = rawMPEG
      ? new Uint8Array(0)
      : MP4.moof(
          track.sequenceNumber++,
          firstPTS! / scaleFactor,
          Object.assign({}, track, { samples: outputSamples })
        );

    // Clear the track samples. This also clears the samples array in the demuxer, since the reference is shared
    track.samples = [];
    const start = firstPTS! / inputTimeScale;
    const end = nextAudioPts / inputTimeScale;
    const type: SourceBufferName = 'audio';
    const audioData = {
      data1: moof,
      data2: mdat,
      startPTS: start,
      endPTS: end,
      startDTS: start,
      endDTS: end,
      type,
      hasAudio: true,
      hasVideo: false,
      nb: nbSamples,
    };

    this.isAudioContiguous = true;

    console.assert(mdat.length, 'MDAT length must not be zero');
    return audioData;
  }
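  // scaleFactor example: with a 90 kHz input timescale and 48 kHz AAC,
  // scaleFactor = 90000 / 48000 = 1.875, so one 1024-sample frame spans
  // 1024 * 1.875 = 1920 ticks on the MPEG-TS input timeline.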

  remuxEmptyAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    videoData: Fragment
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const nextAudioPts: number | null = this.nextAudioPts;
    // sync with the video's timestamp
    const startDTS: number =
      (nextAudioPts !== null
        ? nextAudioPts
        : videoData.startDTS * inputTimeScale) + this._initDTS;
    const endDTS: number = videoData.endDTS * inputTimeScale + this._initDTS;
    // one sample's duration value
    const frameDuration: number = scaleFactor * AAC_SAMPLES_PER_FRAME;
    // number of samples needed to cover this segment's duration
    const nbSamples: number = Math.ceil((endDTS - startDTS) / frameDuration);
    // silent frame
    const silentFrame: Uint8Array | undefined = AAC.getSilentFrame(
      track.manifestCodec || track.codec,
      track.channelCount
    );

    logger.warn('[mp4-remuxer]: remux empty Audio');
    // Can't remux if we can't generate a silent frame...
    if (!silentFrame) {
      logger.trace(
        '[mp4-remuxer]: Unable to remuxEmptyAudio since we were unable to get a silent frame for given audio codec'
      );
      return;
    }

    const samples: Array<any> = [];
    for (let i = 0; i < nbSamples; i++) {
      const stamp = startDTS + i * frameDuration;
      samples.push({ unit: silentFrame, pts: stamp, dts: stamp });
    }
    track.samples = samples;

    return this.remuxAudio(track, timeOffset, contiguous, false);
  }

  remuxID3(
    track: DemuxedMetadataTrack,
    timeOffset: number
  ): RemuxedMetadata | undefined {
    const length = track.samples.length;
    if (!length) {
      return;
    }
    const inputTimeScale = track.inputTimeScale;
    const initPTS = this._initPTS;
    const initDTS = this._initDTS;
    for (let index = 0; index < length; index++) {
      const sample = track.samples[index];
      // set the id3 pts/dts to relative time,
      // using this._initPTS and this._initDTS to calculate the relative time
      sample.pts =
        normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
        inputTimeScale;
      sample.dts =
        normalizePts(sample.dts - initDTS, timeOffset * inputTimeScale) /
        inputTimeScale;
    }
    const samples = track.samples;
    track.samples = [];
    return {
      samples,
    };
  }

  remuxText(
    track: DemuxedUserdataTrack,
    timeOffset: number
  ): RemuxedUserdata | undefined {
    const length = track.samples.length;
    if (!length) {
      return;
    }

    const inputTimeScale = track.inputTimeScale;
    const initPTS = this._initPTS;
    for (let index = 0; index < length; index++) {
      const sample = track.samples[index];
      // set the text pts to relative time,
      // using this._initPTS to calculate the relative time
      sample.pts =
        normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
        inputTimeScale;
    }
    track.samples.sort((a, b) => a.pts - b.pts);
    const samples = track.samples;
    track.samples = [];
    return {
      samples,
    };
  }
}

export function normalizePts(value: number, reference: number | null): number {
  let offset;
  if (reference === null) {
    return value;
  }

  if (reference < value) {
    // - 2^33
    offset = -8589934592;
  } else {
    // + 2^33
    offset = 8589934592;
  }
  /* PTS is 33 bits (from 0 to 2^33 - 1).
     if the diff between value and reference is bigger than half of the amplitude (2^32),
     then PTS looping occurred: fill the gap */
  while (Math.abs(value - reference) > 4294967296) {
    value += offset;
  }

  return value;
}
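// Worked example: normalizePts(1000, 8589933000) sees a gap larger than 2^32
// with reference > value, so it adds 2^33 once: 1000 + 8589934592 =
// 8589935592, which is within 2^32 of the reference and is returned.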

function findKeyframeIndex(samples: Array<AvcSample>): number {
  for (let i = 0; i < samples.length; i++) {
    if (samples[i].key) {
      return i;
    }
  }
  return -1;
}

class Mp4Sample {
  public size: number;
  public duration: number;
  public cts: number;
  public flags: Mp4SampleFlags;

  constructor(isKeyframe: boolean, duration, size, cts) {
    this.duration = duration;
    this.size = size;
    this.cts = cts;
    this.flags = new Mp4SampleFlags(isKeyframe);
  }
}

class Mp4SampleFlags {
  public isLeading: 0 = 0;
  public isDependedOn: 0 = 0;
  public hasRedundancy: 0 = 0;
  public degradPrio: 0 = 0;
  public dependsOn: 1 | 2 = 1;
  public isNonSync: 0 | 1 = 1;

  constructor(isKeyframe) {
    this.dependsOn = isKeyframe ? 2 : 1;
    this.isNonSync = isKeyframe ? 0 : 1;
  }
}
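// These fields mirror the ISO/IEC 14496-12 trun sample flags: dependsOn = 2
// marks a sync sample (keyframe) that depends on no other samples,
// dependsOn = 1 marks a dependent frame, and isNonSync = 1 flags
// non-keyframes so MSE implementations can locate random access points.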